import pandas as pd
import numpy as np
from datetime import datetime
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
import plotly.express as px
from urllib.request import urlopen
import json
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Load the raw CalFire incident records.
wildfire_raw = pd.read_csv('California_Fire_Incidents.csv')
# Columns that are mostly NaN, constant across rows, or otherwise unused below.
_unused_columns = [
    'AirTankers', 'ConditionStatement', 'ControlStatement', 'CrewsInvolved', 'CountyIds', 'Dozers',
    'Engines', 'Fatalities', 'FuelType', 'Helicopters', 'Injuries', 'PersonnelInvolved',
    'StructuresEvacuated', 'StructuresDamaged', 'StructuresDestroyed', 'StructuresThreatened',
    'WaterTenders', 'Active', 'CanonicalUrl', 'Status', 'Updated', 'Final', 'PercentContained',
    'Public', 'Latitude', 'Longitude', 'Featured', 'Location', 'Name',
    'SearchDescription', 'SearchKeywords', 'UniqueId', 'AdminUnit',
]
wildfire_raw.drop(_unused_columns, axis=1, inplace=True)
wildfire_raw.head()
| AcresBurned | ArchiveYear | CalFireIncident | Counties | Extinguished | MajorIncident | Started | |
|---|---|---|---|---|---|---|---|
| 0 | 257314.0 | 2013 | True | Tuolumne | 2013-09-06T18:30:00Z | False | 2013-08-17T15:25:00Z |
| 1 | 30274.0 | 2013 | True | Los Angeles | 2013-06-08T18:30:00Z | False | 2013-05-30T15:28:00Z |
| 2 | 27531.0 | 2013 | True | Riverside | 2013-07-30T18:00:00Z | False | 2013-07-15T13:43:00Z |
| 3 | 27440.0 | 2013 | False | Placer | 2013-08-30T08:00:00Z | False | 2013-08-10T16:30:00Z |
| 4 | 24251.0 | 2013 | True | Ventura | 2013-05-11T06:30:00Z | True | 2013-05-02T07:01:00Z |
AcresBurned (3 NaN values) -- fill the records we can look up manually, then drop the remaining NaN rows
Extinguished (59 NaN values) -- EDA has told us it may be best to drop those rows
Counties (4 incorrect values) -- replace 'Mexico' with 'San Diego,' replace 'State of Oregon' with 'Siskiyou,' replace 'State of Nevada' with 'Nevada' (it's a California County)
# AcresBurned: a handful of records are missing acreage; values below were
# found by a quick manual search of each incident.
_acres_fixes = {
    614: 1324.0,   # 614 - 1,324 acres
    1045: 210.0,   # 1045 - 210 acres
    1052: 5000.0,  # 1052 - 5000 acres
    1367: 100.0,   # 1367 - 100 acres
}
for _row, _acres in _acres_fixes.items():
    wildfire_raw.loc[_row, 'AcresBurned'] = _acres
# Any AcresBurned still missing after the manual patches gets dropped.
wildfire_raw.dropna(subset=['AcresBurned'], inplace=True)
# Counties: replace out-of-state / mislabeled entries with California counties.
_county_fixes = {
    1423: 'Siskiyou',
    1424: 'Nevada',
    1421: 'San Diego',
    1590: 'San Diego',
}
for _row, _county in _county_fixes.items():
    wildfire_raw.loc[_row, 'Counties'] = _county
def to_datetime(date):
    """Parse an ISO-8601 timestamp (e.g. '2013-08-17T15:25:00Z') into a datetime.

    Only the date portion (first 10 characters) is kept; the time of day is
    discarded.  Float NaN — pandas' missing-value marker in these columns — is
    returned unchanged so missing dates stay missing.
    """
    # isinstance() is the idiomatic type check; non-strings here are float NaN.
    if not isinstance(date, str) and math.isnan(date):
        return date
    date_only = str(date)[:10]  # keep 'YYYY-MM-DD', drop the time component
    return datetime.strptime(date_only, '%Y-%m-%d')
# Parse both date columns; two fires have known-bad start dates, patched by hand.
wildfire_raw['StartedDate'] = wildfire_raw['Started'].apply(to_datetime)
wildfire_raw['ExtinguishedDate'] = wildfire_raw['Extinguished'].apply(to_datetime)
wildfire_raw.loc[1019, 'StartedDate'] = to_datetime('2017-05-19')
wildfire_raw.loc[1261, 'StartedDate'] = to_datetime('2018-08-08')
# Split on whether Extinguished is known.  .copy() so the columns added below
# land on real frames rather than views of wildfire_raw (avoids
# SettingWithCopyWarning / silent no-ops under pandas copy-on-write).
null_extinguished = wildfire_raw[wildfire_raw['Extinguished'].isna()].copy()  # rows whose duration would need imputing
notnull_extinguished = wildfire_raw[wildfire_raw['Extinguished'].notnull()].copy()  # training rows for the duration model
notnull_extinguished['BurnDuration'] = (notnull_extinguished['ExtinguishedDate'] - notnull_extinguished['StartedDate'])  # timedelta
notnull_extinguished['BurnDuration'] = [td.days for td in notnull_extinguished['BurnDuration']]  # whole days as int
# Non-positive durations indicate incorrect Extinguished dates; discard them.
notnull_extinguished = notnull_extinguished[notnull_extinguished['BurnDuration'] > 0]
acres_burned, burn_duration = notnull_extinguished['AcresBurned'], notnull_extinguished['BurnDuration']
plt.scatter(acres_burned, burn_duration, alpha=0.25, color='darkorange');
plt.ylim(0, 300);
plt.xlim(0, 4000);
plt.ylabel('Burn Duration (Days)');
plt.xlabel('Burned Area (Acres)');
plt.title('Training Data: Burn Duration vs. Acres Burned')
plt.savefig('burneduration_vs_area')
# This plot is a little concerning for the model that would fill the null burn
# durations: no notable trend is visible.
# Linear regression: does acreage predict burn duration well enough to impute
# the missing Extinguished dates?
x = acres_burned.to_numpy().reshape((-1, 1))
y = burn_duration.to_numpy().reshape((-1, 1))
x_pred = null_extinguished['AcresBurned'].to_numpy().reshape((-1, 1))
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)
burn_duration_model = LinearRegression()
burn_duration_model.fit(x_train, y_train)
y_test_pred = burn_duration_model.predict(x_test)
# Residual distribution on the held-out split.
plt.hist(y_test_pred - y_test, color='firebrick', edgecolor='black', bins=range(-325, 151, 50))
plt.title('(Predicted Burn Duration - True Burn Duration) Distribution')
plt.xlabel('Difference (# Days)')
# Refit on the full data set, then predict the rows with a missing duration.
burn_duration_model.fit(x, y)
y_pred = burn_duration_model.predict(x_pred)
# Overlay the fitted line on the held-out points.
plt.scatter(x_test, y_test, color="darkorange", alpha=0.25)
plt.plot(x_test, y_test_pred, color="blue")
plt.xlim(0, 100000)
plt.xlabel('Burned Area (Acres)');
plt.ylabel('Burn Duration (Days)');
plt.title('Predicted: Burn Duration vs. Acres Burned');
plt.ylim(0, 300);
plt.xlim(0, 4000);
print('Regression Coefficient: ' + str(burn_duration_model.coef_))
Regression Coefficient: [[0.00026496]]
TLDR: burn duration is not strongly related to acres burned, therefore we're dropping the missing extinguished date rows (only 59 rows out of the 1600+)
# EDA above showed burn duration is nearly independent of acres burned, so the
# regression can't impute the 59 missing Extinguished dates — drop those rows.
wildfire_raw.dropna(subset=['Extinguished'], inplace=True)
Adding in average population data per county
Adding in square mileage per county
Adding in climate data per county
# County population estimates, one column per year.  Header is on sheet row 3;
# the first 59 data rows cover the statewide total plus the 58 counties.
population_raw = (
    pd.read_excel('co-est2019-annres-06.xlsx', header=3)
    .rename(columns={'Unnamed: 0': 'County'})
    .iloc[:59, :]
)
# Rows look like '.Alameda County, California' — keep just the county name.
population_raw['County'] = population_raw['County'].str.extract(r'\.(.*?) County, California')
population_raw = population_raw.iloc[1:, :]  # drop the statewide total row
population_raw = population_raw.add_suffix('_population')
population_raw.rename(columns={'County_population': 'county_name'}, inplace=True)
# Collapse the per-year columns into a single across-years mean, then drop them.
population_raw['mean_population'] = population_raw.iloc[:, 3:].mean(axis=1)
_year_columns = [str(year) + '_population' for year in range(2010, 2020)]
population_raw.drop(_year_columns + ['Census_population', 'Estimates Base_population'],
                    axis=1, inplace=True)
population_raw.head()
| county_name | mean_population | |
|---|---|---|
| 1 | Alameda | 1606881.9 |
| 2 | Alpine | 1102.5 |
| 3 | Amador | 37802.5 |
| 4 | Butte | 223503.3 |
| 5 | Calaveras | 45234.0 |
# County land areas; 'Areaname' values look like 'Alameda, CA'.
land_raw = pd.read_excel('land area.xlsx')[['Areaname', 'square miles']]
# Keep only California rows.
land_raw = land_raw.loc[land_raw['Areaname'].str.contains(r', CA$')]
ca_land_raw = pd.DataFrame({})
# Strip the ', CA' suffix so the name matches the other county tables.
ca_land_raw['county_name'] = land_raw['Areaname'].str.extract(r'^(.+?),')
ca_land_raw['square_miles'] = land_raw['square miles']
ca_land_raw.head()
| county_name | square_miles | |
|---|---|---|
| 192 | Alameda | 83.57 |
| 193 | Alpine | 4.57 |
| 194 | Amador | 11.73 |
| 195 | Butte | 37.62 |
| 196 | Calaveras | 16.81 |
# Join population and land-area tables on the shared county-name key.
pop_and_area = population_raw.merge(ca_land_raw, on='county_name')
pop_and_area.head()
| county_name | mean_population | square_miles | |
|---|---|---|---|
| 0 | Alameda | 1606881.9 | 83.57 |
| 1 | Alpine | 1102.5 | 4.57 |
| 2 | Amador | 37802.5 | 11.73 |
| 3 | Butte | 223503.3 | 37.62 |
| 4 | Calaveras | 45234.0 | 16.81 |
# Attach each fire's county population and land area; left join keeps every
# fire row even when a county has no match in pop_and_area.
wildfire_raw = wildfire_raw.merge(pop_and_area, how = 'left', left_on = 'Counties', right_on = 'county_name').drop('county_name', axis=1)
wildfire_raw.head()
| AcresBurned | ArchiveYear | CalFireIncident | Counties | Extinguished | MajorIncident | Started | StartedDate | ExtinguishedDate | mean_population | square_miles | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 257314.0 | 2013 | True | Tuolumne | 2013-09-06T18:30:00Z | False | 2013-08-17T15:25:00Z | 2013-08-17 | 2013-09-06 | 54231.5 | 38.93 |
| 1 | 30274.0 | 2013 | True | Los Angeles | 2013-06-08T18:30:00Z | False | 2013-05-30T15:28:00Z | 2013-05-30 | 2013-06-08 | 10007550.7 | 691.45 |
| 2 | 27531.0 | 2013 | True | Riverside | 2013-07-30T18:00:00Z | False | 2013-07-15T13:43:00Z | 2013-07-15 | 2013-07-30 | 2335696.3 | 95.76 |
| 3 | 27440.0 | 2013 | False | Placer | 2013-08-30T08:00:00Z | False | 2013-08-10T16:30:00Z | 2013-08-10 | 2013-08-30 | 372878.5 | 98.41 |
| 4 | 24251.0 | 2013 | True | Ventura | 2013-05-11T06:30:00Z | True | 2013-05-02T07:01:00Z | 2013-05-02 | 2013-05-11 | 840546.2 | 362.90 |
# Monthly climate readings (temperature, wind, relative humidity) per county.
# The three CSVs share one layout, so a single helper handles the pivot/cleanup
# that was previously triplicated.
def _load_climate(path, suffix, sort=False, backfill=False):
    """Pivot a long-format climate CSV into one row per (county, year).

    Columns become the month numbers (November and December are dropped) plus
    the 'County Name'/'year' keys, all tagged with *suffix* so the three
    climate tables can be merged side by side.  Missing readings are linearly
    interpolated; *backfill* additionally back-fills any leading NaNs, and
    *sort* orders rows by county then year before interpolating.
    """
    wide = pd.read_csv(path).pivot_table(
        values='Arithmetic Mean', index=['County Name', 'year'], columns='month')
    wide.reset_index(inplace=True)
    wide.drop([11, 12], axis=1, inplace=True)  # drop November/December columns
    wide = wide.add_suffix(suffix)
    if sort:
        wide = wide.sort_values(['County Name' + suffix, 'year' + suffix])
    wide = wide.interpolate()
    if backfill:
        wide = wide.bfill()
    return wide

temp_2 = _load_climate('california_temp.csv', '_temp', sort=True, backfill=True)
# NOTE(review): the original wind table was interpolated but never back-filled;
# that behavior is preserved — confirm whether leading NaNs in wind are intentional.
wind_2 = _load_climate('california_wind.csv', '_wind')
rh_2 = _load_climate('california_rh.csv', '_rh', backfill=True)
# Join wildfire data with temperature readings (per county, per year).
wildfire = wildfire_raw.merge(temp_2, how = 'left', left_on = ['ArchiveYear', 'Counties'], right_on = ['year_temp', 'County Name_temp'])
# Join wildfire data with wind readings.
wildfire = wildfire.merge(wind_2, how = 'left', left_on = ['ArchiveYear', 'Counties'], right_on = ['year_wind', 'County Name_wind'])
# Join wildfire data with relative-humidity readings.
wildfire = wildfire.merge(rh_2, how = 'left', left_on = ['ArchiveYear', 'Counties'], right_on = ['year_rh', 'County Name_rh'])
# Drop the duplicated join keys plus the raw string date columns (the parsed
# StartedDate / ExtinguishedDate columns are kept).
wildfire.drop(['County Name_temp', 'year_temp', 'County Name_wind', 'year_wind', 'County Name_rh', 'year_rh', 'Extinguished', 'Started'],
inplace=True, axis=1)
# Sort by county then year so the fills below pull from neighboring rows of the
# same county where possible.
wildfire = wildfire.sort_values(['Counties', 'ArchiveYear'])
# Back-fill remaining NaNs ...
wildfire = wildfire.bfill()
# ... then forward-fill anything still missing.
# NOTE(review): bfill/ffill can cross county boundaries at group edges — confirm
# that is acceptable for the climate columns.
wildfire = wildfire.ffill()
# County-boundary GeoJSON keyed by FIPS code, used by every choropleth below.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
# Build a California county-name -> FIPS lookup table.
fips_to_state = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/minoritymajority.csv', dtype={"FIPS": str})
ca_mask = fips_to_state['STNAME'] == 'California'
fips_to_ca = fips_to_state.loc[ca_mask, ['FIPS', 'CTYNAME']]
fips_to_ca['CTYNAME'] = fips_to_ca['CTYNAME'].str.extract(r'(.*?) County')
# Count fires per county and map them.
geo_df0 = (
    wildfire[['Counties']]
    .merge(fips_to_ca, left_on='Counties', right_on='CTYNAME')
    .groupby(['Counties', 'FIPS'])
    .count()
    .rename(mapper={'CTYNAME': 'count'}, axis=1)
    .reset_index()
)
fig = px.choropleth(geo_df0, geojson=counties, locations='FIPS', color='count', scope='usa',
                    color_continuous_scale="Hot", hover_data=['Counties'],
                    labels={'count': 'Total Fires'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
# Total acres burned per county.
geo_df00 = (
    wildfire[['Counties', 'AcresBurned']]
    .groupby('Counties')
    .sum()
    .reset_index()
    .merge(fips_to_ca, left_on='Counties', right_on='CTYNAME')
    .drop('CTYNAME', axis=1)
)
fig = px.choropleth(geo_df00, geojson=counties, locations='FIPS', color='AcresBurned', scope='usa',
                    color_continuous_scale="Hot", hover_data=['Counties', 'AcresBurned'],
                    labels={'AcresBurned': 'Total Acres Burned'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
# Average fire size per county: total acres burned divided by fire count.
geo_df000 = geo_df0.merge(geo_df00, on=['FIPS', 'Counties'])
geo_df000['AvgAcresPerFire'] = geo_df000['AcresBurned'] / geo_df000['count']
fig = px.choropleth(geo_df000, geojson=counties, locations='FIPS', color='AvgAcresPerFire', scope='usa',
                    color_continuous_scale="Hot", hover_data=['Counties', 'AcresBurned', 'count'],
                    labels={'AvgAcresPerFire': 'Average Acres Burned per Fire'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
# Mean population per county (one row per fire; repeated values don't change the map).
geo_df1 = (
    wildfire[['Counties', 'mean_population']]
    .merge(fips_to_ca, left_on='Counties', right_on='CTYNAME')
    .drop('CTYNAME', axis=1)
)
fig = px.choropleth(geo_df1, geojson=counties, locations='FIPS', color='mean_population', scope='usa',
                    color_continuous_scale="Viridis", hover_data=['Counties'],
                    labels={'mean_population': 'Population'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
# June-October temperature per county, averaged across years.
# .copy() so the added column lands on a real frame, not a view of `wildfire`.
geo_df2 = wildfire[['Counties', '6_temp', '7_temp', '8_temp', '9_temp', '10_temp']].copy()
geo_df2['avgtemp'] = geo_df2.loc[:, '6_temp':'10_temp'].mean(axis=1)
# 'mean' instead of np.mean: passing numpy callables to .agg is deprecated in pandas.
geo_df2 = geo_df2.groupby('Counties').agg('mean').reset_index().merge(fips_to_ca, left_on='Counties', right_on='CTYNAME').drop('CTYNAME', axis=1)
fig = px.choropleth(geo_df2, geojson=counties, locations='FIPS', color='avgtemp', scope='usa',
                    color_continuous_scale="Viridis", hover_data=['Counties'],
                    labels={'avgtemp': 'Average summer temperature across years'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
# June-October wind per county, averaged across years.
# .copy() so the added column lands on a real frame, not a view of `wildfire`.
geo_df3 = wildfire[['Counties', '6_wind', '7_wind', '8_wind', '9_wind', '10_wind']].copy()
geo_df3['avgwind'] = geo_df3.loc[:, '6_wind':'10_wind'].mean(axis=1)
# 'mean' instead of np.mean: passing numpy callables to .agg is deprecated in pandas.
geo_df3 = geo_df3.groupby('Counties').agg('mean').reset_index().merge(fips_to_ca, left_on='Counties', right_on='CTYNAME').drop('CTYNAME', axis=1)
fig = px.choropleth(geo_df3, geojson=counties, locations='FIPS', color='avgwind', scope='usa',
                    color_continuous_scale="Viridis", hover_data=['Counties'],
                    labels={'avgwind': 'Average summer wind across years'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
# June-October relative humidity per county, averaged across years.
# .copy() so the added column lands on a real frame, not a view of `wildfire`.
geo_df4 = wildfire[['Counties', '6_rh', '7_rh', '8_rh', '9_rh', '10_rh']].copy()
geo_df4['avgrh'] = geo_df4.loc[:, '6_rh':'10_rh'].mean(axis=1)
# 'mean' instead of np.mean: passing numpy callables to .agg is deprecated in pandas.
geo_df4 = geo_df4.groupby('Counties').agg('mean').reset_index().merge(fips_to_ca, left_on='Counties', right_on='CTYNAME').drop('CTYNAME', axis=1)
fig = px.choropleth(geo_df4, geojson=counties, locations='FIPS', color='avgrh', scope='usa',
                    color_continuous_scale="Viridis", hover_data=['Counties'],
                    labels={'avgrh': 'Average Relative Humidity across months and years'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
def severity_class(acres_burned):
    """Map an acreage figure to its fire size class (A-G).

    Class A - one-fourth acre or less;
    Class B - more than one-fourth acre, but less than 10 acres;
    Class C - 10 acres or more, but less than 100 acres;
    Class D - 100 acres or more, but less than 300 acres;
    Class E - 300 acres or more, but less than 1,000 acres;
    Class F - 1,000 acres or more, but less than 5,000 acres;
    Class G - 5,000 acres or more.
    """
    if acres_burned <= 0.25:
        return "A"
    # Walk the exclusive upper bounds in order; the first one the acreage
    # falls under names the class.
    for upper_bound, label in ((10, "B"), (100, "C"), (300, "D"), (1000, "E"), (5000, "F")):
        if acres_burned < upper_bound:
            return label
    return "G"
# Label every fire with its size class (A-G).
wildfire['Class'] = wildfire['AcresBurned'].apply(severity_class)
# How many of the smallest (A/B) fires are in the data?
num_classA = (wildfire['Class'] == 'A').sum()
num_classB = (wildfire['Class'] == 'B').sum()
print('Number of Class A and Class B fires: ' + str(num_classA + num_classB))
Number of Class A and Class B fires: 27
wildfire = wildfire[(wildfire['Class'] != 'A') & (wildfire['Class'] != 'B')] #dropping class A and B fires
# Ordinal encoding of the remaining classes.  The A/B entries (5, 6) are
# vestigial: those rows were dropped just above, so they never appear.
wildfire['NumClass'] = wildfire['Class'].replace({'C': 0, 'D': 1, 'E': 2, 'F': 3, 'G': 4, 'A': 5, 'B': 6})
wildfire['NumClass']
54 1
70 1
120 0
139 0
298 3
..
949 0
1037 0
1490 0
1491 0
1494 0
Name: NumClass, Length: 1550, dtype: int64
# Class counts in the remaining data (Class A and B fires were dropped above).
class_counts = wildfire.groupby('Class').count()
class_counts_dict = class_counts.to_dict()['AcresBurned']
plt.bar(class_counts_dict.keys(), class_counts_dict.values(), edgecolor='black');
plt.title('2013-2020 Wildfire Class Counts');
plt.xlabel('Class');
plt.ylabel('# Wildfires');
# Feature engineering for the classifier: burn duration and day-of-year
# features derived from the parsed dates.
# .copy() so the new columns land on a real frame, not a view of `wildfire`.
temp_wildfire = wildfire[['StartedDate', 'ExtinguishedDate']].copy()
temp_wildfire['BurnDuration'] = (temp_wildfire['ExtinguishedDate'] - temp_wildfire['StartedDate'])  # timedelta
temp_wildfire['BurnDuration'] = [td.days for td in temp_wildfire['BurnDuration']]  # whole days as int
temp_wildfire['StartedDOY'] = temp_wildfire['StartedDate'].apply(lambda x: x.timetuple().tm_yday)
temp_wildfire['ExtinguishedDOY'] = temp_wildfire['ExtinguishedDate'].apply(lambda x: x.timetuple().tm_yday)
temp_wildfire.drop(['StartedDate', 'ExtinguishedDate'], axis=1, inplace=True)
# Features: everything except the raw dates, identifiers, and target columns.
X = wildfire.drop(['StartedDate', 'ExtinguishedDate', 'AcresBurned', 'Class', 'NumClass', 'Counties'], axis=1)
X['StartedDOY'] = temp_wildfire['StartedDOY']
X['ExtinguishedDOY'] = temp_wildfire['ExtinguishedDOY']
X['BurnDuration'] = temp_wildfire['BurnDuration']
y = wildfire[['NumClass']]
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=.30, random_state=42, stratify=wildfire['NumClass'])
clf = RandomForestClassifier(max_depth=8, random_state=42)
# Fit on the 1-D label column; passing the one-column DataFrame triggers a
# DataConversionWarning and sklearn ravels it anyway.
clf.fit(train_X, train_y['NumClass'])
RandomForestClassifier(max_depth=8, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(max_depth=8, random_state=42)
# Accuracy on the training split.
train_hits = clf.predict(train_X) == train_y['NumClass'].values
num_correct_train = sum(train_hits)
num_predictions_train = len(train_y)
print("Model Train Accuracy: " + str(num_correct_train / num_predictions_train))
Model Train Accuracy: 0.7576036866359447
# Accuracy on the held-out test split.
clf_pred_y = clf.predict(test_X)
test_hits = clf_pred_y == test_y['NumClass'].values
num_correct_test = sum(test_hits)
num_predictions_test = len(test_y)
print("Model Test Accuracy: " + str(num_correct_test / num_predictions_test))
Model Test Accuracy: 0.5010752688172043
# Majority-class baseline: accuracy from always predicting label 0 (Class C).
len(train_y[train_y['NumClass'] == 0]['NumClass']) / len(train_y) #prediction accuracy if we predicted majority class only
0.4875576036866359
# Feature importances from the fitted forest.
# NOTE(review): this reuses the name `y` from the modeling section above.
y = clf.feature_importances_
fig, ax = plt.subplots()
width = 0.4  # the width of the bars
ind = np.arange(len(y))  # the y locations for the bars
ax.barh(ind, y, width, color='green')
ax.set_yticks(ind + width / 10)
ax.set_yticklabels(X.columns, minor=False)
plt.title('Feature importance in RandomForest Classifier')
plt.xlabel('Relative importance')
plt.ylabel('feature')
# Size the importance figure directly; the original plt.figure(figsize=(20, 20))
# call here only created a stray empty figure ('<Figure ... with 0 Axes>').
fig.set_size_inches(10, 10, forward=True)
<Figure size 1440x1440 with 0 Axes>
# Side-by-side true vs predicted labels for the test split.
test_results = test_y.rename(columns={'NumClass': 'True'})
test_results['Predicted'] = clf_pred_y
test_results['counter'] = 1  # helper column for pivot-table counting
test_results.head()
| True | Predicted | counter | |
|---|---|---|---|
| 203 | 2 | 0 | 1 |
| 1434 | 1 | 0 | 1 |
| 105 | 0 | 0 | 1 |
| 72 | 1 | 0 | 1 |
| 655 | 4 | 2 | 1 |
# Confusion matrix: rows = true class, columns = predicted numeric label.
results_pivot = (
    test_results
    .pivot_table(index='True', columns='Predicted', values='counter', aggfunc='count')
    .reset_index()
)
results_pivot['True'] = results_pivot['True'].replace({0: 'C', 1: 'D', 2: 'E', 3: 'F', 4: 'G'})
results_pivot
| Predicted | True | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|---|
| 0 | C | 197.0 | 16.0 | 5.0 | 4.0 | 5.0 |
| 1 | D | 69.0 | 20.0 | 3.0 | 3.0 | 4.0 |
| 2 | E | 37.0 | 9.0 | 2.0 | 2.0 | NaN |
| 3 | F | 27.0 | 7.0 | 1.0 | 6.0 | 7.0 |
| 4 | G | 19.0 | 3.0 | 5.0 | 6.0 | 8.0 |
# Stacked bar chart of predicted-class counts within each true class.
results_pivot.plot(x='True', kind='bar', stacked=True,
                   title='Predicted and True Wildfire Classes')  # fixed 'Clases' typo in title
plt.xlabel('True Class');
plt.ylabel('Count');
# NOTE(review): these labels assume all five predicted labels 0-4 appear as
# columns, in order — confirm against results_pivot.
plt.legend(labels=['C', 'D', 'E', 'F', 'G'], title='Predicted Class');
plt.xticks(rotation=0);
test_results['TrueClass'] = test_results['True'].replace({0: 'C', 1: 'D', 2: 'E', 3: 'F', 4: 'G'})
test_results['CorrectPrediction'] = test_results['True'] == test_results['Predicted']
# Per-class accuracy.  String aggregator names replace np.sum / np.ma.count:
# passing numpy callables to .agg is deprecated in recent pandas, and the
# resulting column names ('sum', 'count') are unchanged.
correct_pred = test_results[['TrueClass', 'CorrectPrediction']].groupby('TrueClass').agg({"CorrectPrediction": ["sum", "count"]})
correct_pred['PropCorrect'] = correct_pred['CorrectPrediction']['sum'] / correct_pred['CorrectPrediction']['count']
correct_pred.reset_index().plot('TrueClass', 'PropCorrect', color='maroon', kind='bar', legend=False, title='Correct Prediction Proportions, By Class');
plt.xlabel('Class');
plt.ylabel('Proportion');
plt.xticks(rotation=0);
correct_pred.reset_index()[['TrueClass', 'PropCorrect']]
| TrueClass | PropCorrect | |
|---|---|---|
| 0 | C | 0.867841 |
| 1 | D | 0.202020 |
| 2 | E | 0.040000 |
| 3 | F | 0.125000 |
| 4 | G | 0.195122 |